{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Language Detection"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-info\">\n",
    "\n",
    "This tutorial is available as an IPython notebook at [Malaya/example/language-detection](https://github.com/huseinzol05/Malaya/tree/master/example/language-detection).\n",
    "    \n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-info\">\n",
    "\n",
    "This module trained on both standard and local (included social media) language structures, so it is save to use for both.\n",
    "    \n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.13 s, sys: 2.83 s, total: 5.96 s\n",
      "Wall time: 2.71 s\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n",
      "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "import malaya\n",
    "import fasttext"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### labels supported\n",
    "\n",
    "Default labels for language detection module."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "deep-model \n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         eng    0.96760   0.97401   0.97080    553739\n",
      "         ind    0.97635   0.96131   0.96877    576059\n",
      "       malay    0.96985   0.98498   0.97736   1800649\n",
      "    manglish    0.98036   0.96569   0.97297    181442\n",
      "       other    0.99641   0.99627   0.99634   1428083\n",
      "       rojak    0.94221   0.84302   0.88986    189678\n",
      "\n",
      "    accuracy                        0.97779   4729650\n",
      "   macro avg    0.97213   0.95421   0.96268   4729650\n",
      "weighted avg    0.97769   0.97779   0.97760   4729650\n",
      "\n",
      "mesolitica/fasttext-language-detection-v1 \n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         eng    0.94014   0.96750   0.95362    553739\n",
      "         ind    0.97290   0.97316   0.97303    576059\n",
      "       malay    0.98674   0.95262   0.96938   1800649\n",
      "    manglish    0.96595   0.98417   0.97498    181442\n",
      "       other    0.98454   0.99698   0.99072   1428083\n",
      "       rojak    0.81149   0.91650   0.86080    189678\n",
      "\n",
      "    accuracy                        0.97002   4729650\n",
      "   macro avg    0.94363   0.96515   0.95375   4729650\n",
      "weighted avg    0.97111   0.97002   0.97028   4729650\n",
      "\n",
      "mesolitica/fasttext-language-detection-v2 \n",
      "                        precision    recall  f1-score   support\n",
      "\n",
      "         local-english    0.88328   0.87926   0.88127     50429\n",
      "           local-malay    0.93159   0.92648   0.92903     59877\n",
      "        local-mandarin    0.62000   0.95044   0.75045     49820\n",
      "              manglish    0.98494   0.98157   0.98325     49648\n",
      "                 other    0.99168   0.92850   0.95905     64350\n",
      "socialmedia-indonesian    0.97626   0.95390   0.96495     75140\n",
      "      standard-english    0.86918   0.88018   0.87465     49776\n",
      "   standard-indonesian    0.99695   0.99713   0.99704     50148\n",
      "        standard-malay    0.92292   0.94851   0.93554     50049\n",
      "     standard-mandarin    0.90855   0.53587   0.67413     53709\n",
      "\n",
      "              accuracy                        0.89953    552946\n",
      "             macro avg    0.90853   0.89818   0.89494    552946\n",
      "          weighted avg    0.91425   0.89953   0.89893    552946\n",
      "\n",
      "mesolitica/fasttext-language-detection-ms-id \n",
      "                        precision    recall  f1-score   support\n",
      "\n",
      "           local-malay    0.95063   0.93858   0.94457    199961\n",
      "                 other    0.97145   0.98889   0.98009    125920\n",
      "socialmedia-indonesian    0.97923   0.96303   0.97106    213486\n",
      "   standard-indonesian    0.99119   0.99610   0.99364    149055\n",
      "        standard-malay    0.93743   0.95669   0.94696    149336\n",
      "\n",
      "              accuracy                        0.96584    837758\n",
      "             macro avg    0.96599   0.96866   0.96727    837758\n",
      "          weighted avg    0.96591   0.96584   0.96582    837758\n",
      "\n",
      "mesolitica/fasttext-language-detection-en \n",
      "                  precision    recall  f1-score   support\n",
      "\n",
      "   local-english    0.88991   0.89457   0.89223    149823\n",
      "        manglish    0.98619   0.98479   0.98549    149535\n",
      "           other    0.99439   0.99268   0.99354    140651\n",
      "standard-english    0.89162   0.88967   0.89064    150703\n",
      "\n",
      "        accuracy                        0.93952    590712\n",
      "       macro avg    0.94053   0.94043   0.94047    590712\n",
      "    weighted avg    0.93960   0.93952   0.93955    590712\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for k, v in malaya.language_detection.metrics.items():\n",
    "    print(k, v)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Different models support different languages."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## List available language detection models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'mesolitica/fasttext-language-detection-v1': {'Size (MB)': 353,\n",
       "  'Quantized Size (MB)': 31.1,\n",
       "  'dim': 16,\n",
       "  'Label': {0: 'eng',\n",
       "   1: 'ind',\n",
       "   2: 'malay',\n",
       "   3: 'manglish',\n",
       "   4: 'other',\n",
       "   5: 'rojak'}},\n",
       " 'mesolitica/fasttext-language-detection-v2': {'Size (MB)': 1840,\n",
       "  'Quantized Size (MB)': 227,\n",
       "  'dim': 16,\n",
       "  'Label': {0: 'standard-english',\n",
       "   1: 'local-english',\n",
       "   2: 'manglish',\n",
       "   3: 'standard-indonesian',\n",
       "   4: 'socialmedia-indonesian',\n",
       "   5: 'standard-malay',\n",
       "   6: 'local-malay',\n",
       "   7: 'standard-mandarin',\n",
       "   8: 'local-mandarin',\n",
       "   9: 'other'}},\n",
       " 'mesolitica/fasttext-language-detection-ms-id': {'Size (MB)': 537,\n",
       "  'Quantized Size (MB)': 62.5,\n",
       "  'dim': 16,\n",
       "  'Label': {0: 'standard-indonesian',\n",
       "   1: 'socialmedia-indonesian',\n",
       "   2: 'standard-malay',\n",
       "   3: 'local-malay',\n",
       "   4: 'other'}},\n",
       " 'mesolitica/fasttext-language-detection-bahasa-en': {'Size (MB)': 537,\n",
       "  'Quantized Size (MB)': 62.5,\n",
       "  'dim': 16,\n",
       "  'Label': {0: 'bahasa', 1: 'english', 2: 'other'}},\n",
       " 'mesolitica/fasttext-language-detection-en': {'Size (MB)': 383,\n",
       "  'Quantized Size (MB)': 42.3,\n",
       "  'dim': 16,\n",
       "  'Label': {0: 'standard-english',\n",
       "   1: 'local-english',\n",
       "   2: 'manglish',\n",
       "   3: 'other'}}}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "malaya.language_detection.available_fasttext"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "chinese_text = '今天是６月１８号，也是Muiriel的生日！'\n",
    "english_text = 'i totally love it man'\n",
    "indon_text = 'menjabat saleh perombakan menjabat periode komisi energi fraksi partai pengurus partai periode periode partai terpilih periode menjabat komisi perdagangan investasi persatuan periode'\n",
    "malay_text = 'beliau berkata program Inisitif Peduli Rakyat (IPR) yang diperkenalkan oleh kerajaan negeri Selangor lebih besar sumbangannya'\n",
    "socialmedia_malay_text = 'nti aku tengok dulu tiket dari kl pukul berapa ada nahh'\n",
    "socialmedia_indon_text = 'saking kangen papanya pas vc anakku nangis'\n",
    "rojak_text = 'jadi aku tadi bikin ini gengs dan dijual haha salad only k dan haha drinks only k'\n",
    "manglish_text = 'power lah even shopback come to edmw riao'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load Fast-text model\n",
    "\n",
    "Make sure fast-text already installed, if not, simply,\n",
    "\n",
    "```bash\n",
    "pip install fasttext\n",
    "```\n",
    "\n",
    "```python\n",
    "def fasttext(quantized: bool = True, **kwargs):\n",
    "\n",
    "    \"\"\"\n",
    "    Load Fasttext language detection model.\n",
    "    Original size is 353MB, Quantized size 31.1MB.\n",
    "    \n",
    "    Parameters\n",
    "    ----------\n",
    "    quantized: bool, optional (default=True)\n",
    "        if True, load quantized fasttext model. Else, load original fasttext model.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    result : malaya.model.ml.LanguageDetection class\n",
    "    \"\"\"\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this example, I am going to compare with pretrained fasttext from Facebook. https://fasttext.cc/docs/en/language-identification.html\n",
    "\n",
    "Simply download pretrained model,\n",
    "\n",
    "```bash\n",
    "wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = fasttext.load_model('lid.176.ftz')\n",
    "fast_text = malaya.language_detection.fasttext()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Language detection in Malaya is not trying to tackle possible languages in this world, just towards to hyperlocal language.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "([['__label__id']], [array([0.6334154], dtype=float32)])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.predict(['suka makan ayam dan daging'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'standard-english': 0.0,\n",
       "  'local-english': 0.0,\n",
       "  'manglish': 0.0,\n",
       "  'standard-indonesian': 0.0,\n",
       "  'socialmedia-indonesian': 0.0,\n",
       "  'standard-malay': 0.50445783,\n",
       "  'local-malay': 0.0,\n",
       "  'standard-mandarin': 0.0,\n",
       "  'local-mandarin': 0.0,\n",
       "  'other': 0.0}]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fast_text.predict_proba(['suka makan ayam dan daging'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(('__label__ms',), array([0.57101035]))"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.predict(malay_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'standard-english': 0.0,\n",
       "  'local-english': 0.0,\n",
       "  'manglish': 0.0,\n",
       "  'standard-indonesian': 0.0,\n",
       "  'socialmedia-indonesian': 0.0,\n",
       "  'standard-malay': 0.9099521,\n",
       "  'local-malay': 0.0,\n",
       "  'standard-mandarin': 0.0,\n",
       "  'local-mandarin': 0.0,\n",
       "  'other': 0.0}]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fast_text.predict_proba([malay_text])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(('__label__id',), array([0.7870034]))"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.predict(socialmedia_malay_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'standard-english': 0.0,\n",
       "  'local-english': 0.0,\n",
       "  'manglish': 0.0,\n",
       "  'standard-indonesian': 0.0,\n",
       "  'socialmedia-indonesian': 0.0,\n",
       "  'standard-malay': 0.0,\n",
       "  'local-malay': 0.9976433,\n",
       "  'standard-mandarin': 0.0,\n",
       "  'local-mandarin': 0.0,\n",
       "  'other': 0.0}]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fast_text.predict_proba([socialmedia_malay_text])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(('__label__fr',), array([0.2912012]))"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.predict(socialmedia_indon_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'standard-english': 0.0,\n",
       "  'local-english': 0.0,\n",
       "  'manglish': 0.0,\n",
       "  'standard-indonesian': 0.0,\n",
       "  'socialmedia-indonesian': 1.00003,\n",
       "  'standard-malay': 0.0,\n",
       "  'local-malay': 0.0,\n",
       "  'standard-mandarin': 0.0,\n",
       "  'local-mandarin': 0.0,\n",
       "  'other': 0.0}]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fast_text.predict_proba([socialmedia_indon_text])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(('__label__id',), array([0.87948251]))"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.predict(rojak_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'standard-english': 0.0,\n",
       "  'local-english': 0.0,\n",
       "  'manglish': 0.0,\n",
       "  'standard-indonesian': 0.0,\n",
       "  'socialmedia-indonesian': 0.0,\n",
       "  'standard-malay': 0.0,\n",
       "  'local-malay': 0.9569701,\n",
       "  'standard-mandarin': 0.0,\n",
       "  'local-mandarin': 0.0,\n",
       "  'other': 0.0}]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fast_text.predict_proba([rojak_text])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(('__label__en',), array([0.89707506]))"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.predict(manglish_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'standard-english': 0.0,\n",
       "  'local-english': 0.0,\n",
       "  'manglish': 0.99997073,\n",
       "  'standard-indonesian': 0.0,\n",
       "  'socialmedia-indonesian': 0.0,\n",
       "  'standard-malay': 0.0,\n",
       "  'local-malay': 0.0,\n",
       "  'standard-mandarin': 0.0,\n",
       "  'local-mandarin': 0.0,\n",
       "  'other': 0.0}]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fast_text.predict_proba([manglish_text])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(('__label__zh',), array([0.97311586]))"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.predict(chinese_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'standard-english': 0.0,\n",
       "  'local-english': 0.0,\n",
       "  'manglish': 0.0,\n",
       "  'standard-indonesian': 0.0,\n",
       "  'socialmedia-indonesian': 0.0,\n",
       "  'standard-malay': 0.0,\n",
       "  'local-malay': 0.0,\n",
       "  'standard-mandarin': 0.0,\n",
       "  'local-mandarin': 0.5823944,\n",
       "  'other': 0.0}]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fast_text.predict_proba([chinese_text])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'standard-english': 0.0,\n",
       "  'local-english': 0.0,\n",
       "  'manglish': 0.0,\n",
       "  'standard-indonesian': 0.9755073,\n",
       "  'socialmedia-indonesian': 0.0,\n",
       "  'standard-malay': 0.0,\n",
       "  'local-malay': 0.0,\n",
       "  'standard-mandarin': 0.0,\n",
       "  'local-mandarin': 0.0,\n",
       "  'other': 0.0},\n",
       " {'standard-english': 0.0,\n",
       "  'local-english': 0.0,\n",
       "  'manglish': 0.0,\n",
       "  'standard-indonesian': 0.0,\n",
       "  'socialmedia-indonesian': 0.0,\n",
       "  'standard-malay': 0.9099521,\n",
       "  'local-malay': 0.0,\n",
       "  'standard-mandarin': 0.0,\n",
       "  'local-mandarin': 0.0,\n",
       "  'other': 0.0}]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fast_text.predict_proba([indon_text,malay_text])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
